Topic Modelling and Clustering an RSS feed

Modules used:

  • Scikit-Learn
  • nltk
  • feedparser

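If the packages are not already installed, something along these lines should set them up (these are the usual PyPI names); the NLTK stop-word corpus also has to be downloaded once before the imports below will run:

# one-time setup
# pip install scikit-learn nltk feedparser
import nltk
nltk.download('stopwords')   # required for nltk.corpus.stopwords
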
In [1]:
import feedparser
import re
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
import string
from collections import Counter
stop_words = set(stopwords.words('english'))
stop_words.update(string.punctuation)

In [2]:
def getwords(html):
    '''
    Strip HTML tags, tokenize, lowercase, and drop stop words and punctuation
    '''
    # Remove anything that looks like an HTML/XML tag
    txt = re.compile(r'<[^>]+>').sub('', html)
    word_list = [i.lower() for i in wordpunct_tokenize(txt) if i.lower() not in stop_words]
    return word_list

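A quick sanity check on a made-up HTML snippet (illustrative only, not part of the original run) shows what the tokenizer produces:

getwords('<p>Hello, <b>World</b> of feeds!</p>')
# ['hello', 'world', 'feeds']  -- tags, punctuation and stop words removed
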
In [3]:
def getwordcounts(url):
    '''
    Fetch the feed at `url` and return one "title + summary" string per post
    '''
    d = feedparser.parse(url)
    summary = []
    for e in d.entries:
        if 'summary' in e:
            summary.append(e.title + ' ' + e.summary)
        else:
            summary.append(e.title + ' ' + e.description)
    return summary

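Before pulling the summaries it can help to peek at what feedparser returns: `parse` gives a dictionary-like object whose `entries` list holds the individual posts (a quick look, not part of the original run):

d = feedparser.parse('https://sethuiyer.wordpress.com/feed/atom/')
print(d.feed.get('title'))   # blog title, if the feed provides one
print(len(d.entries))        # number of posts in the feed
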
In [4]:
summary = getwordcounts('https://sethuiyer.wordpress.com/feed/atom/')

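Judging by the document indices printed at the end, the feed yielded six posts at the time; a quick check:

len(summary)      # number of posts fetched
summary[0][:80]   # first 80 characters of the first post
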
In [5]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(tokenizer=getwords,   # reuse the cleaning/tokenizing function above
                             max_df=0.5,           # drop terms appearing in more than half the posts
                             min_df=0.1,           # drop terms appearing in fewer than 10% of posts
                             lowercase=True)
tfidf_model = vectorizer.fit_transform(summary)

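The resulting matrix has one row per post and one column per surviving vocabulary term; its shape and the vocabulary can be inspected directly (a quick check, using the same older get_feature_names API as the rest of this notebook):

print(tfidf_model.shape)                    # (n_posts, n_terms)
print(vectorizer.get_feature_names()[:10])  # first few vocabulary terms
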
In [6]:
import collections
km_model = KMeans(n_clusters=3)
km_model.fit(tfidf_model)

# Group document indices by their assigned cluster label
clustering = collections.defaultdict(list)
for idx, label in enumerate(km_model.labels_):
    clustering[label].append(idx)

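At this point `clustering` maps each cluster label to the indices of the posts it contains, which can be printed for a first look at the grouping:

for label, docs in sorted(clustering.items()):
    print('Cluster', label, '->', docs)
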
In [7]:
from sklearn.decomposition import NMF
nmf = NMF(n_components=3, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf_model)
feature_names = vectorizer.get_feature_names()

def print_top_words(model, feature_names, n_top_words=2):
    '''
    Return the n_top_words highest-weighted terms for each NMF component
    '''
    topic_list = []
    for topic_idx, topic in enumerate(model.components_):
        topic_list.append(" ".join([feature_names[i]
                          for i in topic.argsort()[:-n_top_words - 1:-1]]))
    return topic_list

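Two words per topic keeps the labels short; passing a larger n_top_words gives a fuller picture of each component, e.g.:

print_top_words(nmf, feature_names, n_top_words=5)   # five terms per topic instead of two
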
In [8]:
topic_list = print_top_words(nmf, feature_names, 2)

In [9]:
# Note: NMF topic i and KMeans cluster i are matched by index here,
# which is a simplification -- the two models are fitted independently.
for i in range(3):
    print("Topic Name: ", topic_list[i])
    print("Documents in the cluster: ", clustering[i])
    print('----------------')


Topic Name:  data started
Documents in the cluster:  [1, 2]
----------------
Topic Name:  extension limitless
Documents in the cluster:  [3, 4]
----------------
Topic Name:  classification vector
Documents in the cluster:  [0, 5]
----------------